In this first part we load the libraries used throughout the notebook and define the endpoint configuration.
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import ConjunctiveGraph, Namespace, Literal, RDF, RDFS, BNode, URIRef, XSD, Variable
import operator
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import urllib2, StringIO, csv
import re
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Template for per-table named-graph URIs; the literal 'TABLE' is replaced by a table id
NG_TEMPLATE = 'http://lod.cedar-project.nl/resource/v2/TABLE'
# CEDAR SPARQL endpoint queried throughout this notebook
END_POINT = 'http://lod.cedar-project.nl:8080/sparql/cedar'
The following function is used to normalise header labels before comparing them.
def clean_string(text):
    """Normalise a header label for comparison.

    Dots are removed, underscores become spaces, the text is lowercased,
    runs of whitespace are collapsed to single spaces, and leading and
    trailing whitespace is stripped.
    """
    normalised = text.replace('.', '').replace('_', ' ').lower()
    return re.sub(r'\s+', ' ', normalised).strip()
We then get the list of tables
# Download the list of table identifiers, one identifier per line
url = 'https://raw.githubusercontent.com/cgueret/Harmonize/master/tables.txt'
raw_list = urllib2.urlopen(url).read()
tables = [line.strip() for line in StringIO.StringIO(raw_list)]
We first get the raw data out of the SPARQL endpoint (this takes a while!)
# Query each table's named graph for its column headers and build, per table,
# a mapping: header URI -> {'parent': parent header URI, 'label': label Literal}.
# The SPARQL client and the query template are created once and reused for
# every table instead of being rebuilt on each iteration.
QUERY_TEMPLATE = """
prefix tablink: <http://example.org/ns#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>
select distinct ?header ?label ?parent from <GRAPH> where {
?header a tablink:ColumnHeader;
tablink:subColHeaderOf ?parent;
skos:prefLabel ?label.
}
"""
headers = {}
sparql = SPARQLWrapper(END_POINT)  # one client, reused across all tables
sparql.setReturnFormat(JSON)
for table in tables:
    named_graph = NG_TEMPLATE.replace('TABLE', table)
    sparql.setQuery(QUERY_TEMPLATE.replace('GRAPH', named_graph))
    results = sparql.query().convert()
    table_headers = {}
    for result in results["results"]["bindings"]:
        h = URIRef(result['header']['value'])
        p = URIRef(result['parent']['value'])
        l = Literal(result['label']['value'])
        table_headers[h] = {'parent': p, 'label': l}
    headers[table] = table_headers
We then count how often each keyword occurs across the cleaned column-header labels of all tables.
from collections import Counter

# Count how often each whitespace-separated keyword occurs across the
# cleaned column-header labels of every table. Counter replaces the
# hand-rolled "if key not in dict: dict[key] = 0" bookkeeping.
keyword_count = Counter()
for table_headers in headers.itervalues():
    for data in table_headers.itervalues():
        keyword_count.update(clean_string(data['label']).split(' '))
# (keyword, frequency) pairs, most frequent first
sorted_keyword_count = keyword_count.most_common()
We print the top 15 results
for (k,v) in sorted_keyword_count[:15]:
print '{0:<8} => {1}'.format(v, k)
9642 => m 9509 => v 4775 => totaal 4673 => gehuwd 3630 => in 2841 => vrouwen 2720 => mannen 2565 => en 2545 => ongehuwd 2320 => geweest 1825 => der 1768 => van 1529 => de 1389 => o 1332 => provincie
We can also plot the complete data to see how the frequency evolves
# Plot every keyword frequency (log scale) against its rank
frequencies = [count for (_, count) in sorted_keyword_count]
plt.semilogy(range(len(frequencies)), frequencies, 'ro')
plt.axis([0, len(frequencies), 0, frequencies[0]])
plt.xlabel('Index of the dimension')
plt.ylabel('Frequency')
plt.show()
Now let's generate a set of keywords per table
# For every table, collect the set of keywords appearing in its header labels
keyword_sets = {}
for table, table_headers in headers.iteritems():
    table_keywords = set()
    for data in table_headers.itervalues():
        table_keywords.update(clean_string(data['label']).split(' '))
    keyword_sets[table] = table_keywords
and use these sets to create a network based on the size of the pairwise set intersections
# Build one weighted edge per pair of tables that share at least one
# keyword; the weight is the size of their keyword-set intersection.
edges = []
sorted_tb_names = sorted(keyword_sets.keys())
maximum_size = 0.0
for i, table_a in enumerate(sorted_tb_names):
    for table_b in sorted_tb_names[i + 1:]:
        overlap = float(len(keyword_sets[table_a] & keyword_sets[table_b]))
        if overlap > 0:
            edges.append((table_a, table_b, overlap))
            maximum_size = max(maximum_size, overlap)
# Rescale all weights into (0, 1] relative to the largest intersection
edges = [(a, b, s / maximum_size) for (a, b, s) in edges]
Now we can load this network in networkx and plot it :-)
# Build the table-similarity graph and draw it, colouring each node by its
# table type (beroep/BRT in blue, volk/VT in green, woning/WT in red).
G = nx.Graph()
# Add every table as a node
for node in sorted_tb_names:
    G.add_node(node)
# Keep only the stronger edges to reduce visual clutter
for (src, dst, weight) in edges:
    if weight > 0.025:
        G.add_edge(src, dst, {'weight': weight})
# Position the nodes with a force-directed layout
pos = nx.spring_layout(G, iterations=100)
# Group the nodes by the type prefix of their name (e.g. 'BRT_...' -> 'BRT')
color_map = {'BRT': 'b', 'VT': 'g', 'WT': 'r'}
typed_nodes = {}
for node in sorted_tb_names:
    typed_nodes.setdefault(node.split('_')[0], []).append(node)
# Plot the network
plt.figure(figsize=(15, 15))
plt.axis('off')
nx.draw_networkx_edges(G, pos, alpha=0.4, width=1)
for node_type, color in color_map.iteritems():
    # .get() avoids a KeyError when no table of this type is present
    nodelist = typed_nodes.get(node_type, [])
    if nodelist:
        nx.draw_networkx_nodes(G, pos, alpha=0.6, node_size=80,
                               nodelist=nodelist, node_color=color)